探索性資料分析 (Exploratory Data Analysis)
# ggplot 起手式
ggplot(data, aes(x=a, y=b, ...)) + geom_xxx()
# 客製化 ggplot2 佈景主題
thm <- function() {
theme_gray(base_family = "STHeiti") + # 讓Mac使用者能夠顯示中文, Windows使用者應省略這行
theme(text=element_text(size=18)) # 將字體調整至18號
}
df1 <- group_by(raw, Date) %>%
summarise(Sales_Value=sum(Sales_Value), Sales_Number=sum(Sales_Number))
ggplot(df1, aes(x=Date, y=Sales_Number)) + geom_point()

ggplot(df1, aes(x=Date, y=Sales_Number)) + geom_point(size=4) +
labs(x="日期", y="銷售次數") + thm() +
scale_y_continuous(labels=comma) +
scale_x_date(labels=date_format("%m"), breaks = date_breaks("1 month"))

ggplot(df1, aes(x=Date, y=Sales_Value)) + geom_point(size=4) +
labs(x="日期", y="銷售金額") + thm() +
scale_y_continuous(labels=comma) +
scale_x_date(labels=date_format("%m"), breaks = date_breaks("1 month"))

ggplot(df1, aes(x=Sales_Number, y=Sales_Value)) + geom_point(size=4) +
labs(x="銷售次數", y="銷售金額") + thm() +
scale_x_continuous(labels=comma) +
scale_y_continuous(labels=comma)

加入額外的變數作圖
- 日期 vs. 銷售次數 vs. 週間
- 日期 vs. 銷售金額 vs. 週間
- 銷售次數 vs. 銷售金額
df1 <- mutate(df1, is.weekday=strftime(Date, "%u")<6)
ggplot(df1, aes(x=Date, y=Sales_Number, colour=is.weekday)) + geom_point(size=4) +
labs(x="日期", y="銷售次數") + thm() +
scale_y_continuous(labels=comma) +
scale_x_date(labels=date_format("%m"), breaks = date_breaks("1 month"))

ggplot(df1, aes(x=Date, y=Sales_Value, colour=is.weekday)) + geom_point(size=4) +
labs(x="日期", y="銷售金額") + thm() +
scale_y_continuous(labels=comma) +
scale_x_date(labels=date_format("%m"), breaks = date_breaks("1 month"))

ggplot(df1, aes(x=Sales_Number, y=Sales_Value, colour=is.weekday)) + geom_point(size=4) +
labs(x="銷售次數", y="銷售金額") + thm() +
scale_x_continuous(labels=comma) +
scale_y_continuous(labels=comma)

- 日期 vs. 銷售金額 vs. 節日
- 日期 vs. 銷售金額 vs. 颱風
- 日期 vs. 銷售金額 vs. 雨量
str(sup)
'data.frame': 1095 obs. of 8 variables:
$ Date : chr "01/01/2009" "01/02/2009" "01/03/2009" "01/04/2009" ...
$ Lunar_Date: chr "農曆臘月初六日" "農曆臘月初七日" "農曆臘月初八日" "農曆臘月初九日" ...
$ Wyear : chr "W1" "W1" "W1" "W1" ...
$ Wday : chr "星期四" "星期五" "星期六" "星期日" ...
$ Store : chr "H1" "H1" "H1" "H1" ...
$ is.bigday : chr "0" "0" "0" "0" ...
$ Rainfall : chr "0.2" "0" "0" "8.5" ...
$ is.typhoon: chr "0" "0" "0" "0" ...
# 將字串型的變數轉成適當的資料型態
sup <- mutate(sup,
Date=as.Date(Date, format="%m/%d/%Y"),
is.bigday=as.logical(as.integer(is.bigday)),
is.typhoon=as.logical(as.integer(is.typhoon)),
is.weekday=strftime(Date, format="%u")<6,
Rainfall=as.numeric(Rainfall)
)
df2 <- left_join(raw, sup, by=c("Date", "Store")) %>%
group_by(Date, is.weekday, is.bigday, is.typhoon, Rainfall) %>%
summarise(Sales_Value=sum(Sales_Value), Sales_Number=sum(Sales_Number))
ggplot(df2, aes(x=Date, y=Sales_Value, colour=is.bigday)) + geom_point(size=4) +
labs(x="日期", y="銷售金額") + thm() +
scale_y_continuous(labels=comma) +
scale_x_date(labels=date_format("%m"), breaks = date_breaks("1 month"))

ggplot(df2, aes(x=Date, y=Sales_Value, colour=is.typhoon)) + geom_point(size=4) +
labs(x="日期", y="銷售金額") + thm() +
scale_y_continuous(labels=comma) +
scale_x_date(labels=date_format("%m"), breaks = date_breaks("1 month"))

ggplot(df2, aes(x=Date, y=Sales_Value, colour=Rainfall)) +
geom_point(size=4) +
labs(x="日期", y="銷售金額") + thm() +
scale_y_continuous(labels=comma) +
scale_x_date(labels=date_format("%m"), breaks = date_breaks("1 month")) +
scale_colour_gradient2(low="#99FF00", mid="#81DB5A", high="#287AA9", midpoint = 20)

df2 <- mutate(df2, Rain_Lv=cut(Rainfall,
breaks=c(0, 3, 50 ,130, 200, Inf),
labels=c("無雨","小雨","大雨","豪雨","大豪雨"),
right=FALSE))
ggplot(df2, aes(x=Date, y=Sales_Value, colour=Rain_Lv)) +
geom_point(size=4) +
labs(x="日期", y="銷售金額") + thm() +
scale_y_continuous(labels=comma) +
scale_x_date(labels=date_format("%m"), breaks = date_breaks("1 month")) +
scale_colour_brewer(palette = "Set2")

ggplot(df2, aes(x=Sales_Number, y=Sales_Value)) +
geom_point(aes(colour=is.weekday, shape=is.bigday), size=4) +
labs(x="銷售次數", y="銷售金額") + thm() +
scale_x_continuous(labels=comma) +
scale_y_continuous(labels=comma)

df2 <- mutate(df2, group=factor(paste(is.weekday, is.bigday, sep="-"),
levels=c("FALSE-FALSE", "FALSE-TRUE",
"TRUE-FALSE", "TRUE-TRUE"),
labels=c("週間,非節日", "週間,節日",
"非週間,非節日", "非週間,節日")))
ggplot(df2, aes(x=Sales_Number, y=Sales_Value)) +
geom_point(aes(colour=group), size=4) +
labs(x="銷售次數", y="銷售金額") + thm() +
scale_x_continuous(labels=comma) +
scale_y_continuous(labels=comma)

建立線性模型
fit1 <- lm(Sales_Value ~ Sales_Number, data=df2)
summary(fit1)
Call:
lm(formula = Sales_Value ~ Sales_Number, data = df2)
Residuals:
Min 1Q Median 3Q Max
-1782242 -138676 -33780 109120 1884361
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.066e+05 4.469e+04 -2.386 0.0175 *
Sales_Number 1.401e+02 1.522e+00 92.021 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 359200 on 363 degrees of freedom
Multiple R-squared: 0.9589, Adjusted R-squared: 0.9588
F-statistic: 8468 on 1 and 363 DF, p-value: < 2.2e-16
fit2 <- lm(Sales_Value ~ Sales_Number + is.weekday + is.bigday
+ is.weekday:is.bigday, data=df2)
summary(fit2)
Call:
lm(formula = Sales_Value ~ Sales_Number + is.weekday + is.bigday +
is.weekday:is.bigday, data = df2)
Residuals:
Min 1Q Median 3Q Max
-1856027 -127133 -10140 110408 1535202
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.510e+05 8.611e+04 -4.076 5.64e-05 ***
Sales_Number 1.414e+02 2.212e+00 63.925 < 2e-16 ***
is.weekdayTRUE 2.437e+05 5.320e+04 4.580 6.41e-06 ***
is.bigdayTRUE 3.981e+05 1.165e+05 3.417 0.000705 ***
is.weekdayTRUE:is.bigdayTRUE -1.344e+05 1.234e+05 -1.089 0.276952
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 331800 on 360 degrees of freedom
Multiple R-squared: 0.9652, Adjusted R-squared: 0.9648
F-statistic: 2498 on 4 and 360 DF, p-value: < 2.2e-16
out1 <- data.frame(x=df2$Sales_Number, y=predict(fit1))
out2 <- data.frame(x=df2$Sales_Number, y=predict(fit2))
ggplot(df2, aes(x=Sales_Number, y=Sales_Value)) + geom_point(size=4) +
geom_line(aes(x=x,y=y), data=out1, col=2, size=1) +
geom_point(aes(x=x,y=y), data=out2, col=4, size=2) +
labs(x="銷售次數", y="銷售金額") + thm() +
scale_x_continuous(labels=comma) +
scale_y_continuous(labels=comma)

模型診斷
# Total sum of squared error
sum((df2$Sales_Value - out1$y)^2)
[1] 4.684006e+13
# R-squared
1 - sum((df2$Sales_Value - out1$y)^2) / sum((df2$Sales_Value - mean(df2$Sales_Value))^2)
[1] 0.9588946
sum((df2$Sales_Value - out2$y)^2)
[1] 3.963419e+13
1 - sum((df2$Sales_Value - out2$y)^2) / sum((df2$Sales_Value - mean(df2$Sales_Value))^2)
[1] 0.9652182
anova(fit1, fit2)
Analysis of Variance Table
Model 1: Sales_Value ~ Sales_Number
Model 2: Sales_Value ~ Sales_Number + is.weekday + is.bigday + is.weekday:is.bigday
Res.Df RSS Df Sum of Sq F Pr(>F)
1 363 4.6840e+13
2 360 3.9634e+13 3 7.2059e+12 21.817 5.278e-13 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit1)
Call:
lm(formula = Sales_Value ~ Sales_Number, data = df2)
Residuals:
Min 1Q Median 3Q Max
-1782242 -138676 -33780 109120 1884361
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.066e+05 4.469e+04 -2.386 0.0175 *
Sales_Number 1.401e+02 1.522e+00 92.021 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 359200 on 363 degrees of freedom
Multiple R-squared: 0.9589, Adjusted R-squared: 0.9588
F-statistic: 8468 on 1 and 363 DF, p-value: < 2.2e-16
summary(fit2)
Call:
lm(formula = Sales_Value ~ Sales_Number + is.weekday + is.bigday +
is.weekday:is.bigday, data = df2)
Residuals:
Min 1Q Median 3Q Max
-1856027 -127133 -10140 110408 1535202
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.510e+05 8.611e+04 -4.076 5.64e-05 ***
Sales_Number 1.414e+02 2.212e+00 63.925 < 2e-16 ***
is.weekdayTRUE 2.437e+05 5.320e+04 4.580 6.41e-06 ***
is.bigdayTRUE 3.981e+05 1.165e+05 3.417 0.000705 ***
is.weekdayTRUE:is.bigdayTRUE -1.344e+05 1.234e+05 -1.089 0.276952
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 331800 on 360 degrees of freedom
Multiple R-squared: 0.9652, Adjusted R-squared: 0.9648
F-statistic: 2498 on 4 and 360 DF, p-value: < 2.2e-16
autoplot(fit1)

autoplot(fit2)
